Import Libraries

# jsonlite supplies fromJSON() and flatten(); readr supplies read_lines().
# ggplot2 is attached but not used by the code visible in this notebook.
library("jsonlite")
library("ggplot2")
library("readr")
# NOTE(review): machine-specific working directory; the relative dataset path
# used below resolves against it. Consider a project-relative path instead.
setwd("~/Documents/Project/YelpAnalysis/")

Import Business Dataset

# Location of the business dump, relative to the working directory.
business <- "dataset/business.json"
# Read at most 200000 lines; each line is treated as one JSON value.
# NOTE: despite its name, `review` holds raw lines of the business file.
review <- read_lines(business, n_max = 200000)
# Join the lines with commas and wrap them in brackets so the whole set
# parses as a single JSON array.
business.json <- paste("[", paste(review, collapse = ","), "]")
# Parse, then flatten nested data-frame columns into top-level columns.
business.df <- flatten(fromJSON(business.json))
business.df

Investigate variable correlations with MCMC regression

Encode all the categories

# One-hot-encode the business categories: mtabulate() returns one row per
# element of its input and one column per distinct value found in it.
library(qdapTools)
category.lists <- business.df$categories
categories <- mtabulate(category.lists)
categories

Create a count data frame with the frequency of each unique category, then keep the 15 most frequent categories

# Count how many businesses carry each category. Each column of `categories`
# is a per-business indicator, so its column sum is that category's frequency.
# colSums() replaces a loop that re-allocated the data frame on every append.
category.totals <- colSums(categories)
categories.sum <- data.frame(
  category = as.character(names(category.totals)),
  # as.integer() also drops the names, so row names stay 1..n
  counts = as.integer(category.totals),
  # `category` must stay character: it is used to index columns by name below
  stringsAsFactors = FALSE
)
categories.sum

# Keep the 15 most frequent categories. Despite its name,
# `eliminated.categories` holds the categories that are RETAINED.
# head() returns fewer rows when fewer than 15 categories exist, whereas
# `[1:15, ]` would pad with NA rows and break the column lookup below.
eliminated.categories <- categories.sum[
  order(categories.sum$counts, decreasing = TRUE),
]
eliminated.categories <- head(eliminated.categories, 15)
eliminated.categories

# Restrict the indicator table to the retained category columns.
# dplyr is attached here because the later merge step calls left_join().
library(dplyr)
categories.filtered <- categories[eliminated.categories$category]

Merge the categories back into the business data frame, keeping only the target column, the explanatory (category) columns, and the business id and name

# Give both tables the same row index so they can be joined. This relies on
# `categories.filtered` having one row per row of `business.df`, in the same
# order. seq_len() is used so an empty table yields an empty index.
categories.filtered$ID <- seq_len(nrow(categories.filtered))
business.df$ID <- seq_len(nrow(business.df))
# Keep only the identifiers, the target (`stars`) and the category indicators.
# The key is named explicitly so a category column that happens to share a
# name with a business column cannot silently become part of the join key.
encoded.business.df <- left_join(
  business.df[, c("ID", "business_id", "name", "stars")],
  categories.filtered,
  by = "ID"
)
encoded.business.df

Apply MCMC Linear Regression

# Fit the Bayesian linear regression before plotting it: `lm1` was never
# created, so plot(lm1) failed with "object 'lm1' not found". The model call
# matches the notebook source embedded at the end of this file.
# Identifier columns (ID, business_id, name) are excluded from the predictors.
library(MCMCpack)
lm1 <- MCMCregress(
  stars ~ . - ID - business_id - name,
  data = encoded.business.df
)
plot(lm1)
LS0tCnRpdGxlOiAiWWVscCBEYXRhc2V0IEFuYWx5c2lzIC0gcGFydDIiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCiMjIyBJbXBvcnQgTGlicmFyaWVzCmBgYHtyfQpsaWJyYXJ5KCJqc29ubGl0ZSIpCmxpYnJhcnkoImdncGxvdDIiKQpsaWJyYXJ5KCJyZWFkciIpCnNldHdkKCJ+L0RvY3VtZW50cy9Qcm9qZWN0L1llbHBBbmFseXNpcy8iKQpgYGAKCiMjIyBJbXBvcnQgQnVzaW5lc3MgRGF0YXNldApgYGB7cn0KYnVzaW5lc3MgPC0gImRhdGFzZXQvYnVzaW5lc3MuanNvbiIKcmV2aWV3IDwtcmVhZF9saW5lcyhidXNpbmVzcywgbl9tYXggPSAyMDAwMDApCmJ1c2luZXNzLmRmIDwtIGZyb21KU09OKHBhc3RlKCJbIiwgcGFzdGUocmV2aWV3LCBjb2xsYXBzZSA9ICIsIiksICJdIikpCmJ1c2luZXNzLmRmIDwtIGZsYXR0ZW4oYnVzaW5lc3MuZGYpCmJ1c2luZXNzLmRmCmBgYAoKIyMjIEludmVzdGlnYXRlIHZhcmlhYmxlIGNvcnJlbGF0aW9ucyB3aXRoIE1DTUMgcmVncmVzc2lvbgoKIyMjIyBFbmNvZGUgYWxsIHRoZSBjYXRlcm9ncmllcwpgYGB7cn0KIyBvbmUtaG90LWVuY29kZSBhbGwgdGhlIGNhdGVnb3JpZXMKbGlicmFyeShxZGFwVG9vbHMpCmNhdGVnb3JpZXMgPC0gbXRhYnVsYXRlKGJ1c2luZXNzLmRmJGNhdGVnb3JpZXMpCmNhdGVnb3JpZXMKYGBgCgojIyMjIGNyZWF0ZSBhIGNvdW50IGRhdGFmcmFtZSB0byBjb3VudCB0aGUgZnJlcXVlbmN5IG9mIGVhY2ggdW5pcXVlIGNhdGVnb3J5IGFuZCB0aGVuIHN1bSB0aGVtLCBLZWVwIHRoZSBjYXRlZ29yaWVzIHdpdGggdGhlICB0b3AgMTUgZnJlcXVlbmN5CmBgYHtyfQojIGNyZWF0ZSBhIGNvdW50IGRhdGFmcmFtZSB0byBjb3VudCB0aGUgZnJlcXVlbmN5IG9mIGVhY2ggdW5pcXVlIGNhdGVnb3J5IGFuZCB0aGVuIHN1bSB0aGVtCmNhdGVnb3JpZXMuc3VtIDwtIGRhdGEuZnJhbWUoCiAgICAgICAgICAgICAgICAgY2F0ZWdvcnk9Y2hhcmFjdGVyKCksCiAgICAgICAgICAgICAgICAgY291bnRzPWludGVnZXIoKSwKICAgICAgICAgICAgICAgICBzdHJpbmdzQXNGYWN0b3JzPUZBTFNFKQoKZm9yIChuYW1lIGluIG5hbWVzKGNhdGVnb3JpZXMpKSB7CiAgY2F0ZWdvcmllcy5zdW1bbnJvdyhjYXRlZ29yaWVzLnN1bSkgKyAxLF0gPSBsaXN0KG5hbWUsIHN1bShjYXRlZ29yaWVzW25hbWVdKSkKfQpjYXRlZ29yaWVzLnN1bQoKIyBLZWVwIHRoZSB0b3AgMTUgY2F0ZWdvcmllcwplbGltaW5hdGVkLmNhdGVnb3JpZXMgPC0gY2F0ZWdvcmllcy5zdW1bb3JkZXIoY2F0ZWdvcmllcy5zdW0kY291bnRzLCBkZWNyZWFzaW5nID0gVFJVRSksXQplbGltaW5hdGVkLmNhdGVnb3JpZXMgPC0gZWxpbWluYXRlZC5jYXRlZ29yaWVzWzE6MTUsXQplbGltaW5hdGVkLmNhdGVnb3JpZXMKCiMgcmVtb3ZlIGFsbCB0aGUgY29sdW1uIHRoYXQgaXMgbm90IHRoZSB0b3AgMTUgY2F0ZWdvcnkKbGlicmFyeShkcGx5cikKY2F0ZWdvcmllcy5maWx0ZXJlZCA8LSBjYXRlZ29yaWVzW2VsaW1pbmF0ZWQuY2F0ZWdv
cmllcyRjYXRlZ29yeV0KYGBgCgojIyMjIG1lcmdlIHRoZSBjYXRlZ29yaWVzIGJhY2sgdG8gYnVzaW5lc3MgZGF0YWZyYW1lLCBPbmx5IGtlZXAgdGhlIHRhcmdldCBjb2x1bW4sIHRoZSBleHBsYW5hdG9yeSBjb2x1bW5zKGNhdGVnb3JpZXMgY29sdW1uKSBhbmQgYnVzaW5lc3MgaWQgYW5kIG5hbWUKYGBge3J9CmNhdGVnb3JpZXMuZmlsdGVyZWQkSUQgPC0gc2VxLmludChucm93KGNhdGVnb3JpZXMuZmlsdGVyZWQpKQpidXNpbmVzcy5kZiRJRCA8LSBzZXEuaW50KG5yb3coYnVzaW5lc3MuZGYpKQplbmNvZGVkLmJ1c2luZXNzLmRmIDwtIGxlZnRfam9pbihidXNpbmVzcy5kZlssYygiSUQiLCAiYnVzaW5lc3NfaWQiLCAibmFtZSIsICJzdGFycyIpXSwgY2F0ZWdvcmllcy5maWx0ZXJlZCkKZW5jb2RlZC5idXNpbmVzcy5kZgpgYGAKCiMjIyMgQXBwbHkgTUNNQyBMaW5lYXIgUmVncmVzc2lvbgpgYGB7cn0KbGlicmFyeShNQ01DcGFjaykKbG0xIDwtIE1DTUNyZWdyZXNzKHN0YXJzIH4gLiAtIElEIC0gYnVzaW5lc3NfaWQgLSBuYW1lLCBlbmNvZGVkLmJ1c2luZXNzLmRmKQpwbG90KGxtMSkKc3VtbWFyeShsbTEpCnN0ZXAobG0xKQpzdGVwKGxtMSwgaz1sb2cobnJvdyhlbmNvZGVkLmJ1c2luZXNzLmRmKSkpIApgYGAKCg==